here we’re analyszing bulk TCR-seq data from multiple timepoints of LCMV-Cl13 with and without aPDL1

load libraries

suppressPackageStartupMessages({
  library(tidyverse) 
  library(ComplexHeatmap)
  library(cowplot)
  library(ggpubr)
  library(readxl)
  library(RColorBrewer)
  library(MetBrewer)
  library(viridis)
  library(Hmisc)
  library(emmeans)
})
## Warning: package 'Hmisc' was built under R version 4.3.3

set directories & plot aesthetics

path <- "/Users/craposo/Desktop/Github Upload/9_LongtidunalAntiPDL1"

plot.theme <- theme_cowplot() + theme(plot.title = element_text(hjust = 0.5, face = "plain"), 
                                      strip.background = element_rect(fill = "transparent"), strip.text = element_text(size = 14)
                                      )

plot.theme.box <- theme_bw() + 
  theme(plot.title = element_text(hjust =  0.5 , size = 14, color = "black"),
        plot.subtitle = element_text(hjust = 0.5 , size = 10, color = "black"), 
        axis.text = element_text(color = "black", size = 12),
        axis.title = element_text(size = 12),
        axis.ticks = element_line(color = "black", linewidth = 0.7), 
        strip.background = element_blank(), strip.text = element_text(hjust = 0.5, size = 14),
        panel.border = element_rect(colour = "black", linewidth = 0.7),
        panel.grid = element_blank()
        )

color palettes

pop.pal <-  c(
  "Tcm" = "#65c7ca", 
  "Tex-KLR" = "#E27E73" ,
  "Tex-Int" = "#de311e", 
  "Tex-Term" = "#b31e0e", 
  "Tex-Prog" = "#a94d9a",
  "Tem_1" = "#f8a523",
  "Tem_2" = "#f6d13a"
  )

traj_pal <- c(brewer.pal(n = 8, name = "Paired"))
names(traj_pal) <- c(paste0("Traj_", 1:8))

group.pal <- c("PBS" = "#886231", "aPDL1" = "#299093")

import bTCR-seq data

# Specify the directory path and get all files in the directory
bTCR_datapath <- paste0(path, "/data/rawdata")

samples <- list.files(path = bTCR_datapath)

# read in all csvs and append sample ID
bTCR_raw_list <- lapply(samples, function(x){
  sample.id <- substr(x, 27, 32)
  path <- paste0(bTCR_datapath, "/TCRlongtiduinal_Processed_" , sample.id , "_pep.csv")
  data <- read.csv(path)
  data$Sample.id <- sample.id
  return(data)
}) 
  
# append all TCR raw data together
bTCR_rawdata <-  do.call(rbind, bTCR_raw_list)

# read in sample IDs and append to bTCR_rawdata
sample.id <- read.csv(paste0(path, "/data/SampleInfo.csv")) %>% select(Sample, Sample.id) 

bTCR_rawdata <- merge(bTCR_rawdata, sample.id, by = "Sample.id") %>% 
  mutate(CDR3.nuc.= toupper(CDR3.nuc.)) # make CDR3 all upper case to make futures things easier

bTCR_rawdata %>% head()
##   Sample.id     CDR3.pep.            V VRefBegin VRefEnd VReadBegin VReadEnd
## 1    341724    ASSPGTYEQY    mTRBV3*01       120     285          0      165
## 2    341724 ASRNRGINQDTQY mTRBV13-3*01        96     281          4      189
## 3    341724  ASSFGTGGTEVF   mTRBV14*01       103     288         12      197
## 4    341724    ASGDCNTEVF mTRBV13-2*04        91     279         14      202
## 5    341724 ASSLSGTSNTEVF mTRBV12-2*01        98     284          7      193
## 6    341724   ASSRQGSTEVF mTRBV12-2*01       103     283         17      197
##           D DRefBegin DRefEnd DReadBegin DReadEnd           J JRefBegin JRefEnd
## 1 mTRBD1*01         0       6        168      174 mTRBJ2-7*01         4      47
## 2 mTRBD1*01         3      10        191      198 mTRBJ2-5*01         0      49
## 3 mTRBD1*01         0       8        197      205 mTRBJ1-1*01         4      48
## 4         -         -       -          -        - mTRBJ1-1*01         2      48
## 5 mTRBD1*01         0       6        197      203 mTRBJ1-1*01         0      48
## 6 mTRBD1*01         2      10        197      205 mTRBJ1-1*01         4      48
##   JReadBegin JReadEnd         C CRefBegin CRefEnd CReadBegin CReadEnd
## 1        174      217 mTRBC2*03         0      31        217      248
## 2        202      251 mTRBC2*03         0      31        251      282
## 3        208      252 mTRBC2*03         0      31        252      283
## 4        205      251 mTRBC2*03         0      31        251      282
## 5        204      252 mTRBC2*03         0      31        252      283
## 6        207      251 mTRBC2*03         0      31        251      282
##                                                                                                                                                                                                                                                                                     joinedSeq
## 1                                    cagcagatggagtttctggttaatttctACAATGGTAAAGTCATGGAGAAGTCTAAACTGTTTAAGGATCAGTTTTCAGTTGAAAGACCAGATGGTTCATATTTCACTCTGAAAATCCAACCCACAGCACTGGAGGACTCAGCTGTGTACTTCTGTGCCAGCAGCCCCGGGACATATGAACAGTACTTCGGTCCCGGCACCAGGCTCACGGTTTTAGAGGATCTGAGAAATGTGACTCCacccaaggt
## 2  cttttactggtatcggcaggACACGGGGCACGGGCTGAGGCTGATTCATTACTCATATGTCGCTGACAGCACGGAGAAAGGAGATATCCCTGATGGGTACAAGGCCTCCAGACCAAGCCAAGAGAATTTCTCTCTCATTCTGGAGTTGGCTTCCCTTTCTCAGACAGCTGTATATTTCTGTGCCAGCAGGAACAGGGGAATTAACCAAGACACCCAGTACTTTGGGCCAGGCACTCGGCTCCtcgtgttagaggatctgagaaatgtgactccacccaaggt
## 3 ttccgatcttcgatcagcagcccagagACCAGGGGCCCCAGCTTCTAGTTTACTTTCGGGATGAGGCTGTTATAGATAATTCACAGTTGCCCTCGGATCGATTTTCTGCTGTGAGGCCTAAAGGAACTAACTCCACTCTCAAGATCCAGTCTGCAAAGCAGGGCGACACAGCCACCTATCTCTGTGCCAGCAGTTTCGGGACAGGAGGCACAGAAGTCTTCTTTGGTAAAGGAACCAGACTCACAGTTGTAGAGGATCTGAGAAATGTGACTCCACCCaaggt
## 4  tcttccgatctgggactggtatcggcaggACACGGGGCATGGGCTGAGGCTGATCCATTATTCATATGGTGCTGGCAGCACTGAGAAAGGAGATATCCCTGATGGATACAAGGCCTCCAGACCAAGCCAAGAGAACTTCTCCCTCATTCTGGAGTTGGCTACCCCCTCTCAGACATCAGTGTACTTCTGTGCCAGCGGTGATTGTAACACAGAAGTCTTCTTTGGTAAAGGAACCAGACTCACAGTTGTAGaggatctgagaaatgtgactccacccaaggt
## 5 ttccgatctggtatcaacagactcaggGGCAGGAACTAAAGTTCTTCATTCAGCATTATGATAAAATGGAGAGAGATAAAGGAAACCTGCCCAGCAGATTCTCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCTTTCCGGGACATCAaacacagaagtcttctttggtaaaggaaccagactcacagttgtagaggatctgagaaatgtgactccacccaaggt
## 6  tgctcttccgatcttggatcaacagactcaggGGCAGGAACTAAAGTTCTTCATTCAGCATTATGATAAAATGGAGAGAGATAAAGGAAACCTGCCCAGCAGATTCTCAGTCCAACAGTTTGATGACTATCACTCTGAGATGAACATGAGTGCCTTGGAGCTAGAGGACTCTGCCGTGTACTTCTGTGCCAGCTCTCGACAGGGGAGCACAGAAGTCTTCTTTGGTAAAGGAACCAGACTCACAGTTGTAGAGGATCTGAGaaatgtgactccacccaaggt
##                                 CDR3.nuc. copy               Sample
## 1          GCCAGCAGCCCCGGGACATATGAACAGTAC    1 C103_d100_aPDL1_cTcm
## 2 GCCAGCAGGAACAGGGGAATTAACCAAGACACCCAGTAC    1 C103_d100_aPDL1_cTcm
## 3    GCCAGCAGTTTCGGGACAGGAGGCACAGAAGTCTTC    8 C103_d100_aPDL1_cTcm
## 4          GCCAGCGGTGATTGTAACACAGAAGTCTTC   11 C103_d100_aPDL1_cTcm
## 5 GCCAGCTCTCTTTCCGGGACATCAAACACAGAAGTCTTC   14 C103_d100_aPDL1_cTcm
## 6       GCCAGCTCTCGACAGGGGAGCACAGAAGTCTTC    1 C103_d100_aPDL1_cTcm
# check number of TCRs per sample
bTCR_rawdata %>% count(Sample) %>% ggplot(aes(x = Sample, y = n)) + geom_col() + scale_y_log10() + rotate_x_text(90)

# select only the cdr3 and count columns
cdr3.df <- bTCR_rawdata %>% dplyr::select(CDR3.nuc. , copy, Sample) %>%
  mutate(Rep = ifelse(substr(Sample , nchar(Sample) -1, nchar(Sample)) == "_r", "R2", "R1")) %>%  # add replicate column
  mutate(Sample = ifelse(Rep == "R2", substr(Sample, 1, nchar(Sample) -2), Sample))  # remove replicate from sample ID

check qc and filter the bTCR-seq data

# QC check on replicate samples
## everything looks highly concordant between replicates, the one exception is d21 prog - likely because just so few cells
cdr3.df %>% pivot_wider(id_cols = c(CDR3.nuc. , Sample), values_from = copy, names_from = Rep, values_fn = {sum}) %>%
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) %>%
  ggplot(aes(x = R1, y = R2)) + 
    geom_point(alpha = .5, size = 0.5) + 
    geom_smooth(method = "lm", color = "red", fill = "transparent") +
    facet_wrap(~Sample, scales = "free") + 
    stat_cor() + scale_x_log10() + scale_y_log10() + theme_bw() + ggtitle("all clones")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 218962 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 218962 rows containing non-finite values (`stat_cor()`).

# average clone clounts across replicates
## make vector only where clone is in both replicates
both.rep.clones <- cdr3.df %>% pivot_wider(id_cols = c(CDR3.nuc. , Sample), values_from = copy, names_from = Rep, values_fn = {sum}) %>%
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) %>% 
  filter(R1 >0, R2 > 0) %>% 
  dplyr::select(CDR3.nuc.) %>% c()

## create new clone.df with average reads per sample
clone.df <- cdr3.df %>% filter(Sample != "PosCtrl_m20210506", CDR3.nuc. %in% both.rep.clones$CDR3.nuc.) %>% # remove postive control, select only shared clones
  group_by(Rep, Sample, CDR3.nuc.) %>% mutate(ReadFrac = sum (copy)) %>% ungroup() %>% # any TCRs with the same CDR3 add together
  group_by(Rep, Sample) %>% mutate(ReadFrac = copy / sum (copy)) %>% # convert copies to fraction of reads
  ungroup() %>% group_by(Sample, CDR3.nuc.) %>% dplyr::select(-c(Rep, copy)) %>% summarise(ReadFrac = mean(ReadFrac)) %>% # average read fraction between the two reads
  dplyr::rename(TRB_CDR3 = "CDR3.nuc.")
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
# add in cell numbers from sort - adjust the read fractions to cell # 
sampleinfo <- read_xlsx(paste0(path, "/data/CellNumbersSorted.xlsx")) %>%
  mutate(Sample = str_replace_all(Sample, " ", "_")) %>% dplyr::select("Sample", "ActualCellNumbner_FlowData", "SortedCellNumber") %>%
  dplyr::rename(TotalCellNumber = "ActualCellNumbner_FlowData")

clone.df <- merge(clone.df, sampleinfo, by = "Sample") %>% 
  mutate(TotalNumber = ReadFrac * TotalCellNumber, SortedNumber = ReadFrac * SortedCellNumber) %>% 
  ungroup()

# separate DF for all phnotype sorting vs tet sorting
clone.df.pheno <- clone.df %>% filter(Sample != "C103_GP33neg",Sample != "C103_GP33pos") %>% 
  ### mutate sample ID into more useful metadata
  separate(Sample, into = c('experiment', 'dpi', 'group', 'sort')) %>%  
  mutate(TRB_CDR3 = paste(group, TRB_CDR3, sep = "_")) %>%
  ### TotalNumberClone is number of cells per SPL or per 100uL - this is for determining the clonal behavior
  ### SortedNumberClone is the number of cells actually sorted - this is for determining how deeply we actually sequenced
  group_by(TRB_CDR3, dpi, group) %>% mutate(TotalNumberClone = sum(TotalNumber), SortedNumberClone = sum(SortedNumber)) %>% ungroup() %>%
  ### CloneFreq is a given clones frequemcy compared to all other clones in the timepoint
  group_by(dpi, group) %>% mutate(CloneFreq = TotalNumberClone/sum(TotalNumberClone)) %>% ungroup() %>%
  ### remove cell number info from sorting data, leave only clones #s
  dplyr::select(-c(TotalCellNumber, SortedCellNumber))

# rename cells by new names for celltypes
clone.df.pheno <- clone.df.pheno %>% 
  mutate(sort = case_when(
    sort == "cTcm" ~ "Tcm", 
    sort == "Prog"  ~ "Tex-Prog", 
    sort == "Term"  ~ "Tex-Term", 
    sort == "Int" & dpi == "d100" ~ "Tem_2", 
    sort == "KLR" & dpi == "d100" ~ "Tem_1", 
    sort == "Int" & dpi != "d100" ~ "Tex-Int", 
    sort == "KLR" & dpi != "d100" ~ "Tex-KLR"
  ))
  
# check clones per phenotype
table(clone.df.pheno$sort, clone.df.pheno$dpi, clone.df.pheno$group)
## , ,  = aPDL1
## 
##           
##            d100  d21  d35
##   Tcm      1542    0    0
##   Tem_1    1524    0    0
##   Tem_2    1032    0    0
##   Tex-Int     0  363  989
##   Tex-KLR     0  289  785
##   Tex-Prog 1002  154 1069
##   Tex-Term  485    0    0
## 
## , ,  = Ctrl
## 
##           
##            d100  d21  d35
##   Tcm      7552    0    0
##   Tem_1    1485    0    0
##   Tem_2     980    0    0
##   Tex-Int     0  439  867
##   Tex-KLR     0  352  776
##   Tex-Prog  786  141  178
##   Tex-Term  456    0    0

set vectors of clones that can be traced between chronic infection and after

we’re not requiring that a clone be detected at both d21 and d35 for two reasons

(1) de novo primed clones not present before aPDL1 may expand with aPDL1 and joim the response

(2) in each group, there was one mouse that we were unable to get a good bleed on day 35 so we did not include cells from those mice in this dataset

# visualize clone sizes
clone.df.pheno %>% ggplot(aes(x = SortedNumberClone, y = ReadFrac)) + 
  geom_point(alpha = 0.1) + facet_wrap(group~dpi) + 
  scale_x_log10() + scale_y_log10() + geom_vline(xintercept = 10, color = "red", linetype = "dashed") 

# isolate clones detected at d100 and also at at least one timepoint in the blood
## blood d21 and 35
d21.clones <- clone.df.pheno %>% filter(SortedNumberClone >= 10, dpi == "d21")
d35.clones <- clone.df.pheno %>% filter(SortedNumberClone >= 10, dpi == "d35")
blood.clones <- c(d21.clones$TRB_CDR3, d35.clones$TRB_CDR3) %>% unique()

## spleen d100+
d100.clones <- clone.df.pheno %>% filter(SortedNumberClone >= 10, dpi == "d100")

# set clones of interest as those found in the blood and the spleen 
clones.of.interest <-  intersect(blood.clones, d100.clones$TRB_CDR3)

look at clone behaviors at individual timepoints - check that data look like we’d expect from previous experiments

# day 21 ==============================================================
clone.freq.d21 <- clone.df.pheno %>%
  filter(TRB_CDR3 %in% clones.of.interest) %>%
  mutate(freq = (TotalNumber/TotalNumberClone)) %>% 
  dplyr::select(sort, dpi, TRB_CDR3, freq, CloneFreq) %>%
  filter(dpi == "d21") 
  
wide <- clone.freq.d21 %>% 
  pivot_wider(names_from = sort, values_from = freq) %>% 
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) %>% 
  filter(TRB_CDR3 %in% clones.of.interest) %>%
  separate(TRB_CDR3, into = c("Group", "TRB_CDR3"))

kmeans <- wide %>% select(-c(dpi, TRB_CDR3, Group, CloneFreq, Group)) %>% kmeans(centers = 4)

wide$Cluster <- kmeans$cluster

wide %>% ungroup() %>% select(-c(dpi, TRB_CDR3,Group, CloneFreq, Cluster)) %>% 
  Heatmap(split = wide$Cluster, col = brewer.pal(9, "Blues"), border = T, column_title = "Day 21")
## Warning: The input is a data frame-like object, convert it to a matrix.

# day 35 ==============================================================
clone.freq.d35 <- clone.df.pheno %>%
  filter(TRB_CDR3 %in% clones.of.interest) %>%
  mutate(freq = (TotalNumber/TotalNumberClone)) %>% 
  dplyr::select(sort, dpi, TRB_CDR3, freq, CloneFreq) %>%
  filter(dpi == "d35") 
  
wide <- clone.freq.d35 %>% 
  pivot_wider(names_from = sort, values_from = freq) %>% 
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) %>% 
  filter(TRB_CDR3 %in% clones.of.interest) %>%
  separate(TRB_CDR3, into = c("Group", "TRB_CDR3")) 

kmeans <- wide %>% select(-c(dpi, TRB_CDR3, Group, CloneFreq, Group)) %>% kmeans(centers = 4)

wide$Cluster <- kmeans$cluster

wide %>% ungroup() %>% select(-c(dpi, TRB_CDR3,Group, CloneFreq, Cluster)) %>% 
  Heatmap(split = wide$Cluster, col = brewer.pal(9, "Blues"), border = T, column_title = "Day 35" , name = "freq of clone") 
## Warning: The input is a data frame-like object, convert it to a matrix.

# day 100 ==============================================================

clone.freq.d100 <- clone.df.pheno %>%
  filter(TRB_CDR3 %in% clones.of.interest) %>%
  mutate(freq = (TotalNumber/TotalNumberClone)) %>% 
  dplyr::select(sort, dpi, TRB_CDR3, freq, CloneFreq) %>%
  filter(dpi == "d100") 
  
wide <- clone.freq.d100 %>% 
  pivot_wider(names_from = sort, values_from = freq) %>% 
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) %>% 
  filter(TRB_CDR3 %in% clones.of.interest) %>%
  separate(TRB_CDR3, into = c("Group", "TRB_CDR3")) 

kmeans <- wide %>% select(-c(dpi, TRB_CDR3, Group, CloneFreq, Group)) %>% kmeans(centers = 7)

wide$Cluster <- kmeans$cluster

wide %>% ungroup() %>% select(-c(dpi, TRB_CDR3,Group, CloneFreq, Cluster)) %>% 
  Heatmap(split = wide$Cluster, col = brewer.pal(9, "Blues"), border = T, column_title = "Day 100+", name = "freq of clone") 
## Warning: The input is a data frame-like object, convert it to a matrix.

#### all Timpoints =============================================

all.tp.wide <- rbind(clone.freq.d21, clone.freq.d35) %>%
  rbind(clone.freq.d100) %>%
  mutate(sort = paste(dpi,sort, sep = "___")) %>% 
  select(-c(CloneFreq, dpi)) %>%
  pivot_wider(names_from = sort, values_from = freq) %>%
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) 

# heatmap of clones across TP
all.tp.wide %>% ungroup() %>% select(-c(TRB_CDR3)) %>% 
  Heatmap(col = brewer.pal(9, "Blues"), border = T, cluster_columns = F, show_row_dend = F, name = "Freq of clone at timepoint") 
## Warning: The input is a data frame-like object, convert it to a matrix.

# check to make sure correlation between subsets
all.tp.wide %>% ungroup() %>% select(-c(TRB_CDR3)) %>% cor() %>% Heatmap(name = "Pearson R", column_title = "Correlation of Clone Freq", col = brewer.pal(9, "RdYlBu") %>% rev, border = T)

Determine longitudinal clonal trajectories

to do this, adapt cyclone (from Alex Huang https://doi.org/10.1016/j.ccell.2024.08.007) - but add phenotype frequencies, so instead of calcualtion correlations of clone size at each timepoiint, we calcualtion correlation on a matrix of clone frequecny in a given phenotype (of total cells)

# clone frequency - clone and phenotype as proportion of all cells at that timepoint
clone.freq.longitudinal <- clone.df.pheno %>%  
  mutate(freq = (TotalNumber/TotalNumberClone)*CloneFreq) %>% dplyr::select(sort, dpi, TRB_CDR3, freq, CloneFreq, CloneFreq) %>% 
  mutate(name = paste(dpi, sort, sep = "--")) %>% 
  dplyr::select(TRB_CDR3, name, freq)%>% pivot_wider(values_from = freq) %>%
  mutate(across(everything(), ~ ifelse(is.na(.), 0, .))) %>% 
  filter(TRB_CDR3 %in% clones.of.interest)

head(clone.freq.longitudinal)
## # A tibble: 6 × 12
##   TRB_CDR3              `d100--Tcm` `d100--Tem_2` `d100--Tem_1` `d100--Tex-Prog`
##   <chr>                       <dbl>         <dbl>         <dbl>            <dbl>
## 1 aPDL1_GCCAGCAATAGTCA…   0.000181      0            0.0000605        0         
## 2 aPDL1_GCCAGCACACAGGG…   0.0000320     0            0.00000962       0         
## 3 aPDL1_GCCAGCACCATAGA…   0.00114       0.00151      0.00209          0.000143  
## 4 aPDL1_GCCAGCAGACATGC…   0.000153      0.000225     0.0000177        0.00000846
## 5 aPDL1_GCCAGCAGATATGG…   0.000216      0.000150     0.0000466        0.000102  
## 6 aPDL1_GCCAGCAGATCCGG…   0.000674      0.0000581    0.000165         0.0000367 
## # ℹ 7 more variables: `d100--Tex-Term` <dbl>, `d21--Tex-Int` <dbl>,
## #   `d21--Tex-KLR` <dbl>, `d21--Tex-Prog` <dbl>, `d35--Tex-Int` <dbl>,
## #   `d35--Tex-KLR` <dbl>, `d35--Tex-Prog` <dbl>
# Transpose the desired data for correlation
df_transposed <- dplyr::select(clone.freq.longitudinal, -TRB_CDR3) %>% t

# Calculate the correlation matrix
correlation_matrix <- cor(df_transposed, use = "pairwise.complete.obs", method = "pearson")

# Convert correlation to distance matrix and perform hierarchical clustering
dist_matrix <- as.dist(1 - correlation_matrix)
hc <- hclust(dist_matrix, method = "complete")
plot(hc) # Plot dendrogram

# Cut tree into k clusters
set.seed(23)
clusters <- cutree(hc, k=8)

# add behavior info to clone.freq.longitudinal
clone.freq.longitudinal$Behavior <- paste0("T_" , clusters)

# # rename behaviors - group by pattern of expansion or contraction
clone.freq.longitudinal <- clone.freq.longitudinal %>% mutate(Behavior = case_when(
    Behavior == "T_1"  ~ "Traj_1", # contracting
    Behavior == "T_5"  ~ "Traj_2",
    Behavior == "T_8"  ~ "Traj_3",
    Behavior == "T_7"  ~ "Traj_4",
    Behavior == "T_4"  ~ "Traj_5", # expand then contract
    Behavior == "T_6"  ~ "Traj_6",
    Behavior == "T_2"  ~ "Traj_7", # expand whole time
    Behavior == "T_3"  ~ "Traj_8"
  ))

# mutate metadata of clone.freq.longitudinal
clone.freq.longitudinal <- clone.freq.longitudinal %>% separate(TRB_CDR3, into = c("group", "CDR3")) %>%
  mutate(group = ifelse(group ==  "Ctrl" , "PBS", "aPDL1")) %>%
  mutate(group = factor(group, levels = c("PBS", "aPDL1")))

# plot heatmap with clusters
hm <- Heatmap(correlation_matrix, 
    name = "Pearson R",
    column_title = "Pairwise Clone Correlations",
    # col = viridis(9),
    col = brewer.pal(9, "RdBu") %>% rev(),
    show_row_names = FALSE,show_column_names = FALSE,
    cluster_rows = TRUE, cluster_columns = TRUE, show_row_dend = F, show_column_dend = F,
    left_annotation = rowAnnotation(` `  = clone.freq.longitudinal$Behavior, show_legend = F, col = list (` ` = traj_pal)),
    top_annotation = columnAnnotation(` `  = clone.freq.longitudinal$Behavior, show_legend = F, col = list (` ` = traj_pal)),
    column_split = clone.freq.longitudinal$Behavior, row_split = clone.freq.longitudinal$Behavior, 
    row_title_rot = 0,
    border = FALSE, row_gap = unit(0, "cm"), column_gap = unit(0, "cm"),
    height = unit(3, "in"), width = unit(3, "in") 
    # use_raster = T
)

hm

# freq of clone behaviors per group

clone.freq.longitudinal %>%
  count(Behavior, group)  %>% 
  group_by(group) %>% mutate(n = n/sum(n))  %>%
  ggplot(aes(x = group, y = n*100, fill = Behavior)) + 
  geom_col(show.legend = T) + 
  plot.theme + 
  scale_y_continuous(expand = expansion(mult = c(0.0, 0.05))) + 
  labs(x = "Group", y = "% of Clones") + 
  scale_fill_manual(values = traj_pal)  + rotate_x_text(45)

summarize clone trajectory phenotype distribution

# summary plots for clone behaviors
data_long <- clone.freq.longitudinal %>%
  pivot_longer(cols = -c(group, CDR3, Behavior),
               names_to = c("Day", "Sort"),
               names_sep = "--",
               values_to = "Value")

# summarize behavior per timepoint
data_summary <- data_long %>%
  group_by(Behavior, Day, Sort) %>%
  summarise(Average_Value = mean(Value, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Behavior', 'Day'. You can override using
## the `.groups` argument.
data_summary %>% group_by(Day, Behavior) %>% mutate(n = Average_Value/sum(Average_Value)) %>% select(-Average_Value) %>%pivot_wider(names_from = Sort, values_from = n)
## # A tibble: 24 × 9
## # Groups:   Day, Behavior [24]
##    Behavior Day      Tcm  Tem_1   Tem_2 `Tex-Prog` `Tex-Term` `Tex-Int`
##    <chr>    <chr>  <dbl>  <dbl>   <dbl>      <dbl>      <dbl>     <dbl>
##  1 Traj_1   d100   0.317  0.502  0.152      0.0239    0.00501    NA    
##  2 Traj_1   d21   NA     NA     NA          0.0603   NA           0.207
##  3 Traj_1   d35   NA     NA     NA          0.0877   NA           0.377
##  4 Traj_2   d100   0.212  0.449  0.234      0.0507    0.0537     NA    
##  5 Traj_2   d21   NA     NA     NA          0.0228   NA           0.919
##  6 Traj_2   d35   NA     NA     NA          0.0850   NA           0.516
##  7 Traj_3   d100   0.716  0.160  0.0893     0.0238    0.0112     NA    
##  8 Traj_3   d21   NA     NA     NA          0.255    NA           0.639
##  9 Traj_3   d35   NA     NA     NA          0.305    NA           0.354
## 10 Traj_4   d100   0.580  0.165  0.0834     0.114     0.0574     NA    
## # ℹ 14 more rows
## # ℹ 1 more variable: `Tex-KLR` <dbl>
data_summary %>% group_by(Day, Behavior) %>% pivot_wider(names_from = Sort, values_from = Average_Value)
## # A tibble: 24 × 9
## # Groups:   Day, Behavior [24]
##    Behavior Day          Tcm      Tem_1    Tem_2 `Tex-Prog` `Tex-Term` `Tex-Int`
##    <chr>    <chr>      <dbl>      <dbl>    <dbl>      <dbl>      <dbl>     <dbl>
##  1 Traj_1   d100   0.0000799  0.000126   3.83e-5 0.00000603    1.26e-6 NA       
##  2 Traj_1   d21   NA         NA         NA       0.0000842    NA        0.000290
##  3 Traj_1   d35   NA         NA         NA       0.0000368    NA        0.000158
##  4 Traj_2   d100   0.0000345  0.0000731  3.81e-5 0.00000825    8.73e-6 NA       
##  5 Traj_2   d21   NA         NA         NA       0.0000137    NA        0.000552
##  6 Traj_2   d35   NA         NA         NA       0.0000261    NA        0.000159
##  7 Traj_3   d100   0.000197   0.0000442  2.47e-5 0.00000656    3.08e-6 NA       
##  8 Traj_3   d21   NA         NA         NA       0.000195     NA        0.000490
##  9 Traj_3   d35   NA         NA         NA       0.000109     NA        0.000127
## 10 Traj_4   d100   0.0000609  0.0000173  8.76e-6 0.0000120     6.03e-6 NA       
## # ℹ 14 more rows
## # ℹ 1 more variable: `Tex-KLR` <dbl>
# plot phenotypes per trajectory 
plot.pheno <- data_summary  %>% 
  mutate(Day = case_when(
    Day == "d21" ~ "21 dpi", 
    Day == "d35"  ~ "35 dpi", 
    Day == "d100"  ~ "100+ dpi")) %>% 
  ggplot(aes(x = Day, y = Average_Value, fill = Sort, group = Sort)) + 
  geom_col() + 
  facet_wrap(~Behavior, ncol = 4, scales = "free") + 
  scale_x_discrete(limits = c("21 dpi", "35 dpi", "100+ dpi")) + 
  scale_fill_manual(values = pop.pal) + 
  plot.theme + rotate_x_text(45) + labs(x = "Timepoint", y = "Freq of Cells (Avg. per Clone)") +
  scale_y_continuous(expand = expansion(mult = c(0.0, 0.05)), labels = function(x) format(x, scientific = TRUE)) + 
  theme(legend.position = "none")



plot.pheno

plot clone frequencies for trejctories

# new dataframe to hold longitudinal data
df_totalfreq <- clone.freq.longitudinal %>%
  rowwise() %>%
  mutate(
    d21Prop = sum(c_across(starts_with("d21"))),
    d35Prop = sum(c_across(starts_with("d35"))),
    d100Prop = sum(c_across(starts_with("d100")))
  ) %>%
  ungroup()

# Step 2: Reshape data from wide to long
df_long <- df_totalfreq %>%
  select(group, CDR3, Behavior, d21Prop, d35Prop, d100Prop) %>%
  pivot_longer(
    cols = c(d21Prop, d35Prop, d100Prop),
    names_to = "TimePoint",
    values_to = "Proportion"
  ) %>%
  mutate(TimePoint = factor(TimePoint, levels = c("d21Prop", "d35Prop", "d100Prop")))

# plot all clones sizes
ggplot(data = df_long) +
  # Add individual lines with low alpha
  geom_line(aes(x = TimePoint, y = Proportion, group = interaction(CDR3)), alpha = 0.1) +
  facet_wrap(~ Behavior, ncol = 4) +
  labs(x = "Timepoint", y = "Freq of Cells (Log Scale)") +
  scale_x_discrete(labels = c("21 dpi", "35 dpi", "100+ dpi")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  scale_color_manual(values = c(group.pal), name = "Group") + 
  scale_y_log10() 
## Warning: Transformation introduced infinite values in continuous y-axis

# plot average clone size per group clone sizes

df_avg <- df_long %>%
  group_by(TimePoint, Behavior) %>%
  summarise(AvgProportion = mean(Proportion), sem = sd(Proportion)/sqrt(length(Proportion))) %>% 
  mutate(trend = ifelse(Behavior %in% c(paste0("Traj_", 1:4)),"Contracting", "Expanding"))
## `summarise()` has grouped output by 'TimePoint'. You can override using the
## `.groups` argument.
df_avg  %>%
  ggplot(aes(x = TimePoint, color = Behavior, y = AvgProportion)) + 
  geom_errorbar(aes(ymin = AvgProportion - sem, ymax = AvgProportion + sem), width = 0.05) + 
   geom_line(aes(group = Behavior)) +
  geom_point(size = 1.5) + 
  facet_wrap(~trend, scales = "free") + 
  ylim(0,0.0016) +
  plot.theme + 
  scale_x_discrete(labels = c("21 dpi", "35 dpi", "100+ dpi")) + labs(x = "Timepoint", y = "Freq of Cells", color = "Trajectory") + 
  scale_color_manual(values = brewer.pal(10, "Paired")) + rotate_x_text(45) + 
  theme(legend.position = "right")

clones biased towards Tcm - what clone behavior do they come from?

determine 100+ clone behaviors with hypergeometric test

# create new dataframe that shows intracelonal phenotype per timepont
all.tp.withclusters <- merge(x = clone.freq.longitudinal %>% select(group, CDR3, Behavior), 
                             y = all.tp.wide %>% separate(TRB_CDR3, into = c("group", "CDR3")) %>% # from above chunk looking at individual TP
                               mutate(group = ifelse(group ==  "Ctrl" , "PBS", "aPDL1")) %>% 
                               mutate(group = factor(group, levels = c("PBS", "aPDL1")))) 

# select only 100+ dpi phenotype
d100.clones <- all.tp.withclusters %>% select(group, CDR3, Behavior, d100___Tcm, d100___Tem_2, d100___Tem_1, `d100___Tex-Prog` , `d100___Tex-Term`) %>% 
  rename(Long_Traj = Behavior, 
         Tcm = d100___Tcm, 
         Tem_2 = d100___Tem_2, 
         Tem_1 = d100___Tem_1, 
         `Tex-Prog` = `d100___Tex-Prog` , 
         `Tex-Term` = `d100___Tex-Term`)

# look at which clones form what at d100+
d100.clones %>% pivot_longer(-c(group, CDR3,Long_Traj)) %>% 
  group_by(name, Long_Traj, group) %>% mutate(freqency = mean(value), nclones = length(CDR3)) %>% 
  ggplot(aes(x = name, y = Long_Traj, size = nclones, color = freqency)) + geom_point() + facet_wrap(~group) + 
  plot.theme.box + rotate_x_text(45) + scale_color_distiller(palette = "Reds", direction = "rev")

d100.clones %>% select(-c(CDR3, group, Long_Traj)) %>% 
  Heatmap(row_split = d100.clones$Long_Traj, 
          col = brewer.pal(9, "Purples"), border = T, row_title_rot = 0, name = "% of clone")  
## Warning: The input is a data frame-like object, convert it to a matrix.

# Function to calculate fate bias significance by hypergeometric test
calculate_fate_bias_significance <- function(M, m, N, n) {
  # Perform one-sided hypergeometric test
  p_value <- phyper(m - 1, n, N - n, M, lower.tail = FALSE)
  # Example usage:
  # M: total cells in differentiated states for this clone
  # m: cells in the fate cluster of interest for this clone
  # N: total cells in differentiated states across all clones
  # n: total cells in the fate cluster of interest across all clones
  return(p_value)
}

# get data from each clone - add proper values for M m N n
bias.df <-
  clone.freq.longitudinal %>% pivot_longer(cols = -c(group,CDR3, Behavior), names_to = "Pop", values_to = "n") %>%
  filter(grepl("d100", Pop)) %>%
  mutate(n = n*1e6) %>%
  group_by(Pop) %>% mutate(Total_Count_Sort = sum(n)) %>% ungroup() %>% # add total cells per sort across clones (n)
  group_by(group,CDR3) %>% mutate(Total_Count_Clone = sum(n))  %>% ungroup() %>% # add total cells per clone (M)
  mutate(Total_Count_Total = sum(n)) %>% # add the total number of cells (N)
  dplyr::rename(N = Total_Count_Total, M = Total_Count_Clone, n = Total_Count_Sort, m = n)

# calulte one-sided hypergeometric test
bias.df$p_value <- mapply(calculate_fate_bias_significance, bias.df$M, bias.df$m, bias.df$N, bias.df$n)
bias.df$p_value_adj <-  p.adjust(bias.df$p_value , method = "hochberg")



# look on a heatmap & compare phenotype prop vs. hypergeometic
hm.prop <- bias.df %>% 
  pivot_wider(names_from = Pop, values_from = p_value_adj, id_cols = c(group, CDR3, Behavior), values_fill = 1) %>% 
  merge(d100.clones) %>%
  select("d100--Tcm", "d100--Tem_2", "d100--Tem_1", "d100--Tex-Prog", "d100--Tex-Term") %>% Heatmap(name = "P adj",  col = viridis(9), border = T)
## Warning: The input is a data frame-like object, convert it to a matrix.
hm.hyper <- bias.df %>% 
  pivot_wider(names_from = Pop, values_from = p_value_adj, id_cols = c(group, CDR3, Behavior), values_fill = 1) %>% 
  merge(d100.clones) %>%
  select("Tcm", "Tem_2", "Tem_1", "Tex-Prog", "Tex-Term") %>% Heatmap(name = "Fraction of Clone",  col = brewer.pal(9, "Blues"), border = T)
## Warning: The input is a data frame-like object, convert it to a matrix.
hm.hyper + hm.prop

# select lowest P vlaue per clone and say a clone is biased towards a phenotype based on lowest P value that is sig
bias.wide <- bias.df %>% 
  pivot_wider(names_from = Pop, values_from = p_value_adj, id_cols = c(group, CDR3, Behavior), values_fill = 1) 

bias.wide <- bias.wide %>% select(c("d100--Tcm" ,"d100--Tem_2", "d100--Tem_1","d100--Tex-Prog", "d100--Tex-Term")) %>%
  rowwise() %>%  # Enable row-wise operations
  mutate(
    pval = min(c_across(contains("d100")), na.rm = TRUE),  # Find the minimum value across "d100" columns
    min_p = names(.)[which.min(c_across(contains("d100")))]  # Find the name of the column with the minimum value
  ) %>%
  mutate(min_p = ifelse(pval < 0.05, min_p, "ns")) %>%
  cbind(bias.wide %>% select(group, CDR3, Behavior)) %>% 
  mutate(min_p = case_when(
    min_p == "d100--Tcm" ~ "Tcm", 
    min_p == "d100--Tem_2"  ~ "Tem_2", 
    min_p == "d100--Tem_1"  ~ "Tem_1",
    min_p == "d100--Tex-Prog"  ~ "Tex-Prog", 
    min_p == "d100--Tex-Term"  ~ "Tex-Term", 
    min_p == "ns" ~ "No Bias")
)

# plot clones based on hypergeometic enrichment and longitudinal trajectory
bias.wide %>% count(group, Behavior, min_p) %>% 
  group_by(min_p) %>% mutate(order = sum(n)) %>%
  ggplot(aes(x = fct_reorder(min_p , order), y = n, fill=Behavior)) + 
  geom_col(show.legend = F) + 
  facet_wrap(~group, scales = "free") + 
  plot.theme + rotate_x_text(45) + 
  scale_fill_manual(values = traj_pal) + 
  labs(x = "Clonal Bias 100+ dpi", y = "# of clones") + 
  scale_y_continuous(expand = expansion(mult = c(0.0, 0.05))) 

logistic regression to determine if clone trajectories are different per group

# set up dataframe logistic regression ===============================
regress.df <-  clone.freq.longitudinal %>% 
  mutate(group = ifelse(group == "PBS", 0, 1)) # set group = 1 if clone is in the aPDL1 group

# Run logistic regression model with Clone_Behavior as predictor
model <- glm(group ~ Behavior, data = regress.df, family = binomial)
summary(model)
## 
## Call:
## glm(formula = group ~ Behavior, family = binomial, data = regress.df)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.02553    0.09226   0.277  0.78197    
## BehaviorTraj_2 -0.47612    0.14528  -3.277  0.00105 ** 
## BehaviorTraj_3  0.72824    0.43856   1.661  0.09681 .  
## BehaviorTraj_4 -0.21742    0.20884  -1.041  0.29784    
## BehaviorTraj_5  0.22842    0.12063   1.894  0.05828 .  
## BehaviorTraj_6 -0.94870    0.23810  -3.984 6.77e-05 ***
## BehaviorTraj_7  1.37033    0.23769   5.765 8.16e-09 ***
## BehaviorTraj_8  0.37993    0.31797   1.195  0.23214    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2625.2  on 1894  degrees of freedom
## Residual deviance: 2522.8  on 1887  degrees of freedom
## AIC: 2538.8
## 
## Number of Fisher Scoring iterations: 4
# Create new data frame with unique values of Behaviors and Generate predictions based on Behavior
new_data <- data.frame(Behavior = unique(regress.df$Behavior)) 
predictions <- predict(model, new_data, type = "response", se.fit = TRUE)

predictions
## $fit
##         1         2         3         4         5         6         7         8 
## 0.5063830 0.8015267 0.6000000 0.5631501 0.3892216 0.2843137 0.4521739 0.6800000 
## 
## $se.fit
##          1          2          3          4          5          6          7 
## 0.02306140 0.03484771 0.07302967 0.01911924 0.02667885 0.04466423 0.04641146 
##          8 
## 0.09329520 
## 
## $residual.scale
## [1] 1
# Store predictions and standard errors in the new data frame
new_data$fit <- predictions$fit
new_data$se.fit <- predictions$se.fit

# plot comparisons
ggplot(data = new_data, aes(x = fct_reorder(Behavior, fit), y = fit, color = Behavior)) +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "grey80") +
  geom_point(size = 2.5) + 
  geom_errorbar(aes(ymin = fit - 1.96 * se.fit, ymax = fit + 1.96 * se.fit, width = 0.2), width = 0.25) + # 95% confidence interval
  plot.theme +
  # ylim(NA, max_y * 1.4) +
  ylab("Predicted Enrichment in aPDL1 vs Ctrl") +
  xlab("Longitundinal Clone Trajectory") +
  theme(legend.position = "none") + 
  facet_wrap(~" ") +
  scale_y_continuous(limits = c(0,1.01), breaks = c(0,0.5,1)) + 
  scale_color_manual(values = brewer.pal(10, "Paired")) + rotate_x_text(45)

check overlap with day 7 clones

we sorted clones from these mice at day based on GP33 tetramer to determine clones that were GP33+/-, but this timepoint also served to tell us if clones were present early after infection

clones.d7 <- clone.df %>% filter(Sample %in% c("C103_GP33neg", "C103_GP33pos"))

clone.freq.longitudinal %>% mutate(early = ifelse(CDR3 %in% clones.d7$TRB_CDR3, "early", "novel")) %>%
  count(group, Behavior, early) %>% 
  group_by(Behavior, group) %>% mutate(n=n/sum(n)) %>%
  ggplot(aes(x = Behavior, y = n, fill = early)) + geom_col() + facet_wrap(~group) + rotate_x_text()

clone.freq.longitudinal %>% mutate(early = ifelse(CDR3 %in% clones.d7$TRB_CDR3, "early", "novel")) %>%
  count(group, Behavior, early) %>% 
  # group_by(Behavior, group) %>% mutate(n=n/sum(n)) %>%
  ggplot(aes(x = Behavior, y = n, fill = early)) + geom_col() + facet_wrap(~group) + rotate_x_text()